## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## .default = col_logical(),
## X1 = col_double()
## )
## See spec(...) for full column specifications.
*This will show us a map of the whole world, with the countries coloured based on their most common topic of study.*
# Choropleth of the whole world: each country filled by log(paper Count).
# `text = Count` is not a real ggplot2 aesthetic (hence the warning below),
# but plotly reads it to populate the hover tooltip.
world_plot <- ggplot() +
  geom_polygon(
    data = world_data2,
    aes(x = long, y = lat, group = group, fill = log(Count), text = Count)
  ) +
  #geom_point(data=data,aes(x = long, y = lat),alpha=0.5,size=0.5,colour="grey")+
  scale_fill_viridis() +
  coord_fixed(1.3) +
  theme_void()
## Warning: Ignoring unknown aesthetics: text
# Interactive version; tooltip shows the raw Count carried via `text`.
ggplotly(world_plot, tooltip = "text")
## R Markdown
###get only europe
# Restrict the world geometries to Europe: build a bounding-box polygon
# (WGS84) and keep only the countries lying entirely inside it.
europe_bbox <- c(xmin = -9, ymin = 36, xmax = 42.5, ymax = 70.1) %>%
  sf::st_bbox(crs = st_crs(4326)) %>%
  sf::st_as_sfc()
# For each country, st_within() lists the bbox polygons that contain it
# (an empty result means the country is outside the box).
inside_bbox <- sf::st_within(world_sf, europe_bbox)
## although coordinates are longitude/latitude, st_within assumes that they are planar
#> although coordinates are longitude/latitude, st_within assumes that they are planar
europe_sf <- world_sf[lengths(inside_bbox) > 0, ]
# Which institution points fall inside which European country?
# sparse = FALSE returns a logical matrix: rows = points, cols = polygons.
europe_result <- st_within(point_sf, europe_sf, sparse = FALSE)
## although coordinates are longitude/latitude, st_within assumes that they are planar
# Calculate the total count of points in each polygon and store it as a
# new column "Count". colSums() on the logical matrix is the idiomatic
# (and faster) form of apply(x, 2, sum).
europe_sf <- europe_sf %>%
  mutate(Count = colSums(europe_result))
# Drop the geometry column to get a plain data frame for joining/plotting.
europe_df <- europe_sf
st_geometry(europe_df) <- NULL
# Pull the polygon outlines from maps::map_data() and attach them to the
# per-country counts so the result can be drawn with geom_polygon().
world_data <- map_data("world")
europe_data <- left_join(europe_df, world_data, by = "region")
# Keep only the institution points that intersect a European country,
# then add explicit X/Y coordinate columns for plotting.
ind <- sf::st_intersects(point_sf, europe_sf)
## although coordinates are longitude/latitude, st_intersects assumes that they are planar
## although coordinates are longitude/latitude, st_intersects assumes that they are planar
points_europe <- point_sf[lengths(ind) > 0, ]
points_europe <- cbind(points_europe, st_coordinates(points_europe))
# Drop columns 6 and 7 by position.
# NOTE(review): positional column drop is fragile -- confirm which columns
# these are and prefer dropping them by name (e.g. select(-...)).
points_europe <- points_europe[, -c(6, 7)]
*Now we are plotting the number of papers that come out of each country in Europe and also adding in the locations of the institutions.*
# Read the pre-computed Europe polygon data and institution point locations.
europe_data <- read.csv("https://raw.githubusercontent.com/dy-lin/hs19-trends/master/workshop/data/europe_data.csv")
points_europe <- read.csv("https://raw.githubusercontent.com/dy-lin/hs19-trends/master/workshop/data/europe_points.csv")
# "region;count" labels.
# NOTE(review): `text` is not used by the plot below -- the plotly tooltip
# comes from the geom_point `text` aesthetic instead. Kept in case a later
# (unseen) chunk uses it; confirm and remove if dead.
text <- paste(europe_data$region, europe_data$Count, sep = ";")
# Choropleth of Europe with institution locations overlaid; str_wrap()
# keeps long affiliation strings readable in the plotly tooltip.
pl <- ggplot() +
  geom_polygon(data = europe_data,
               aes(x = long, y = lat, group = group, fill = log(Count))) +
  geom_point(data = points_europe,
             aes(x = X, y = Y, text = str_wrap(affiliation, 50)),
             alpha = 0.5, size = 0.5, colour = "grey") +
  coord_fixed(1.3) +
  scale_fill_viridis() +
  theme_void()
## Warning: Ignoring unknown aesthetics: text
ggplotly(pl, tooltip = "text")
Let’s load our packages:
library(tidyverse)
library(tidytext)
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(widyr)
library(igraph)
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:purrr':
##
## compose, simplify
## The following object is masked from 'package:tidyr':
##
## crossing
## The following object is masked from 'package:tibble':
##
## as_data_frame
## The following object is masked from 'package:rgeos':
##
## union
## The following object is masked from 'package:plotly':
##
## groups
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
library(ggplot2)
library(ggraph)
##
## Attaching package: 'ggraph'
## The following object is masked from 'package:sp':
##
## geometry
library(readr)
library(tidygraph)
##
## Attaching package: 'tidygraph'
## The following object is masked from 'package:igraph':
##
## groups
## The following object is masked from 'package:stats':
##
## filter
Load in the CSV and see what topic options we have available.
# Load the bigram dataset and list the distinct topics it covers.
df <- read.csv("https://raw.githubusercontent.com/dy-lin/hs19-trends/master/workshop/data/bigrams.csv")
# knitr:: made explicit -- knitr is never attached via library() in this file.
print(knitr::kable(unique(df$topic)))
##
##
## |x |
## |:---------------------|
## |Assembly |
## |Databases |
## |Epigenetics |
## |Gene Expression |
## |Genome Annotation |
## |Phylogenetics |
## |Sequence Alignment |
## |Sequencing |
## |Structural Prediction |
## |Variant Calling |
*This takes the bigram frequency determination and plotting and wraps it in one function, `visualize_bigrams()`. The plotting starts on line 228.*
#' Build and plot a bigram network for one topic.
#'
#' Cleans the given text column (strips non-alphanumeric characters and
#' stop words), tokenizes it into bigrams, keeps the 100 most frequent
#' ones, and draws them as a graph: nodes are words (sized and coloured
#' by term frequency), edges link words that co-occur in a bigram
#' (edge alpha scales with bigram frequency).
#'
#' @param df_name Data frame containing the text to analyse.
#' @param textfield Unquoted column of `df_name` holding the text.
#' @param topic_title Plot title (string).
#' @return A ggraph/ggplot object.
visualize_bigrams <- function(df_name, textfield, topic_title){
  # Clean the text: drop punctuation/symbols, then remove stop words.
  df_cleaned <- df_name %>%
    mutate(textfield_clean = removeWords(gsub("[^A-Za-z0-9 ]", "", {{textfield}}), stop_words$word))
  # Tokenize into bigrams and tabulate their frequencies.
  df_bigrams <- df_cleaned %>%
    unnest_tokens(bigrams, textfield_clean, token = "ngrams", n = 2)
  df_freq <- as.data.frame(table(df_bigrams$bigrams)) %>%
    arrange(desc(Freq))
  # Keep the 100 most frequent bigrams (ties included) and split each
  # "word1 word2" string into its component words.
  df_top_bigrams <- df_freq %>%
    top_n(100, Freq) %>%
    separate(Var1, c("word1", "word2"))
  top_bigram_words <- c(df_top_bigrams$word1, df_top_bigrams$word2) %>%
    unique()
  # Unigram frequencies for the words appearing in the top bigrams;
  # these become the node attributes of the graph.
  word_list <- df_cleaned %>%
    unnest_tokens(words, textfield_clean, token = "ngrams", n = 1)
  df_word_list <- as.data.frame(table(word_list$words)) %>%
    arrange(desc(Freq)) %>%
    filter(Var1 %in% top_bigram_words)
  names(df_word_list)[2] <- "Term_Frequency"
  names(df_top_bigrams)[3] <- "Edge_Frequency"
  # Left assignment instead of the original `... -> graph_hold`.
  graph_hold <- graph_from_data_frame(vertices = df_word_list, d = df_top_bigrams)
  # Note: the original chain included scale_fill_viridis_c(), a no-op here
  # since no layer maps a fill aesthetic; removed.
  pl <- graph_hold %>%
    ggraph(layout = "fr") +
    geom_edge_link(aes(edge_alpha = Edge_Frequency), show.legend = TRUE) +
    geom_node_point(aes(color = Term_Frequency, size = Term_Frequency), alpha = 0.7) +
    geom_node_text(aes(label = name), repel = TRUE) +
    scale_color_viridis_c(direction = -1) +
    theme_void() +
    # guides(size = FALSE) is deprecated since ggplot2 3.3.4; "none" is
    # the supported way to suppress the size legend.
    guides(size = "none") +
    labs(title = quo_name(topic_title)) +
    theme(plot.title = element_text(size = 26, face = "bold"))
  #ggsave(pl,filename = paste0("../figures/", "bigrams_", str_to_lower(str_replace(topic_title, "\\s", "_")), ".png")
  #,width = 12
  #,height = 8)
  pl
}
Now that we have the function made, decide on a topic and make a bigram diagram for that topic.
# Subset to the "Assembly" topic and draw its bigram network (no title).
df_assembly <- filter(df, topic == "Assembly")
visualize_bigrams(df_assembly, abstract, "")
## Warning: Factor `journal` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## Warning: Factor `affiliation` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## Warning: Factor `abstract` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## Warning: Factor `journal` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## Warning: Factor `affiliation` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## Warning: Factor `abstract` contains implicit NA, consider using
## `forcats::fct_explicit_na`